import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.pylab import *
import warnings
warnings.filterwarnings('ignore')

# Load the Titanic training set; the relative path is resolved against the
# current working directory.
all_df = pd.read_csv('train_titanic.csv')

import re
# Pre-compiled pattern used to isolate the title inside a passenger name.
# First alternative: everything up to and including a ", " (greedy).
# Second alternative: a literal "." and everything after it.
regx = re.compile('(.*, )|(\\..*)')


def a(name):
    """Return ``name`` with every match of ``regx`` removed.

    For a name such as "Braund, Mr. Owen Harris" this leaves "Mr".
    A string containing neither ", " nor "." is returned unchanged.
    """
    return regx.sub('', name)
# Derive the title from each passenger name, then record the title's length
# as an additional column.
all_df['title'] = all_df['Name'].apply(a)
all_df['title_len'] = all_df['title'].apply(len)

# Quick look at the first rows, including the two new columns.
print(all_df.head())
# re.sub demo: replace every '6' with '1'
# print(re.sub('6', '1', '1611616166161'))

# Inspect the distribution of titles
# print(all_df['title'].value_counts())

# Merge synonymous titles
# all_df.loc[all_df['title']=='Mlle','title']='Miss'
# all_df.loc[all_df['title']=='Ms','title']='Miss'
# all_df.loc[all_df['title']=='Mme','title']='Mrs'
# all_df.loc[all_df.title=='Mlle','title']='Miss'
# all_df.loc[all_df.title=='Ms','title']='Miss'
# all_df.loc[all_df.title=='Mme','title']='Mrs'
# 'Mlle' and 'Ms' are folded into 'Miss'; 'Mme' is folded into 'Mrs'.
all_df.loc[all_df['title'].isin(['Mlle', 'Ms']), 'title'] = 'Miss'
all_df.loc[all_df['title'] == 'Mme', 'title'] = 'Mrs'
print(all_df['title'].value_counts())

# Any title outside the common set is collapsed into a single 'rare' bucket.
common = ['Mr', 'Miss', 'Mrs', 'Master']
all_df.loc[~all_df['title'].isin(common), 'title'] = 'rare'
